Gradients

Logistic Function

$$ \begin{aligned} &z = w^\top x + b, \quad \sigma(z) = \frac{1}{1 + e^{-z}} \\ &\mathcal{L} = \ell(y, \sigma(z)) = y \cdot \log \frac{1}{\sigma(z)} + (1 - y) \cdot \log \frac{1}{1 - \sigma(z)} \\ &\begin{cases} \frac{\partial \mathcal{L}}{\partial \sigma} = \frac{1 - y}{1 - \sigma} - \frac{y}{\sigma} = \frac{\sigma - y}{\sigma \cdot (1 - \sigma)} \\ \frac{\partial \sigma}{\partial z} = \sigma \cdot (1 - \sigma) \\ \frac{\partial \mathcal{L}}{\partial z} = \frac{\partial \mathcal{L}}{\partial \sigma} \cdot \frac{\partial \sigma}{\partial z} = \sigma - y \\ \end{cases} \\ &\begin{cases} \frac{\partial \mathcal{L}}{\partial w} = \frac{\partial \mathcal{L}}{\partial z} \cdot \frac{\partial z}{\partial w} = [\sigma - y] \cdot x \\ \frac{\partial \mathcal{L}}{\partial b} = \frac{\partial \mathcal{L}}{\partial z} \cdot \frac{\partial z}{\partial b} = [\sigma - y] \cdot 1 \\ \end{cases} \end{aligned} $$
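
As a quick sanity check on the $\frac{\partial \mathcal{L}}{\partial z} = \sigma - y$ result, the analytic gradient can be compared against a central finite difference. A minimal sketch using NumPy; the test values `y = 1.0`, `z = 0.3` are chosen purely for illustration.

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def loss(y, z):
    # binary cross-entropy, written exactly as in the derivation above
    s = sigmoid(z)
    return y * np.log(1.0 / s) + (1 - y) * np.log(1.0 / (1 - s))

y, z = 1.0, 0.3
analytic = sigmoid(z) - y                                    # dL/dz = sigma - y
eps = 1e-6
numeric = (loss(y, z + eps) - loss(y, z - eps)) / (2 * eps)  # central difference
print(analytic, numeric)  # the two values should agree to ~1e-9
```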

Softmax Function

$$ \begin{aligned} &\gamma_{k} = \sum_{j} w_{jk} \cdot z_{j} + w_{0k} \\ &\mathcal{L} = \ell(\underline{y}, \underline{\gamma}) = \sum_{c=1}^{C} y_c \cdot \log \frac{1}{\text{softmax}(\underline{\gamma})_{c}} \\ &\begin{cases} \text{softmax}(\underline{\gamma})_{c} &= \frac{e^{\gamma_c}}{\sum_{l=1}^{C} e^{\gamma_l}} \\ \frac{\partial \text{softmax}(\underline{\gamma})_{c}}{\partial \gamma_{c}} &= \frac{e^{\gamma_c} \cdot \left[ \sum_{l = 1}^{C} e^{\gamma_l} \right] - e^{\gamma_c} \cdot \left[ e^{\gamma_c} \right]}{\left[ \sum_{l = 1}^{C} e^{\gamma_l} \right]^2} \\ &= \frac{e^{\gamma_c}}{\left[ \sum_{l = 1}^{C} e^{\gamma_l} \right]} \cdot \frac{\left[ \sum_{l = 1}^{C} e^{\gamma_l} - e^{\gamma_c} \right]}{\left[ \sum_{l = 1}^{C} e^{\gamma_l} \right]} \\ &= \text{softmax}(\underline{\gamma})_{c} \cdot \left[ 1 - \text{softmax}(\underline{\gamma})_{c} \right] \\ \frac{\partial \text{softmax}(\underline{\gamma})_{j}}{\partial \gamma_{k}}\ (k \neq j) &= -\frac{e^{\gamma_{j}} \cdot e^{\gamma_{k}}}{\left[ \sum_{l = 1}^{C} e^{\gamma_l} \right]^2} \\ &= -\frac{e^{\gamma_{j}}}{\left[ \sum_{l = 1}^{C} e^{\gamma_l} \right]} \cdot \frac{e^{\gamma_{k}}}{\left[ \sum_{l = 1}^{C} e^{\gamma_l} \right]} \\ &= -\text{softmax}(\underline{\gamma})_{j} \cdot \text{softmax}(\underline{\gamma})_{k} \\ \frac{\partial \mathcal{L}}{\partial \gamma_{k}} &= - \left[ \sum_{c \neq k}^{C} y_c \cdot \frac{\partial \log \text{softmax}(\underline{\gamma})_{c}}{\partial \gamma_{k}} \right] - y_k \cdot \frac{\partial \log \text{softmax}(\underline{\gamma})_{k}}{\partial \gamma_{k}} \\ &= - \left[ \sum_{c \neq k}^{C} y_c \cdot \frac{\frac{\partial \text{softmax}(\underline{\gamma})_{c}}{\partial \gamma_{k}}}{\text{softmax}(\underline{\gamma})_{c}} \right] - y_k \cdot \frac{\frac{\partial \text{softmax}(\underline{\gamma})_{k}}{\partial \gamma_{k}}}{\text{softmax}(\underline{\gamma})_{k}} \\ &= \left[ \sum_{c \neq k}^{C} y_c \cdot \text{softmax}(\underline{\gamma})_{k} \right] - y_k \cdot \left[ 1 - \text{softmax}(\underline{\gamma})_{k} \right] \\ &= \left[ \sum_{c = 1}^{C} y_c \cdot \text{softmax}(\underline{\gamma})_{k} \right] - y_k \\ &= \text{softmax}(\underline{\gamma})_{k} \cdot \left[ \sum_{c = 1}^{C} y_c \right] - y_k \\ &= \text{softmax}(\underline{\gamma})_{k} - y_k \\ \end{cases} \\ &\begin{cases} \frac{\partial \mathcal{L}}{\partial w_{jk}} &= \frac{\partial \mathcal{L}}{\partial \gamma_{k}} \cdot \frac{\partial \gamma_{k}}{\partial w_{jk}} = \left[ \text{softmax}(\underline{\gamma})_{k} - y_k \right] \cdot z_{j} \\ \frac{\partial \mathcal{L}}{\partial w_{0k}} &= \frac{\partial \mathcal{L}}{\partial \gamma_{k}} \cdot \frac{\partial \gamma_{k}}{\partial w_{0k}} = \left[ \text{softmax}(\underline{\gamma})_{k} - y_k \right] \cdot 1 \end{cases} \end{aligned} $$
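
The same check works for the softmax gradient: with a target $\underline{y}$ that sums to one, $\frac{\partial \mathcal{L}}{\partial \gamma_k} = \text{softmax}(\underline{\gamma})_k - y_k$ can be verified coordinate by coordinate with finite differences. A minimal NumPy sketch; the logits and one-hot target below are chosen only for illustration.

```python
import numpy as np

def softmax(gamma):
    e = np.exp(gamma - gamma.max())   # shift by the max for numerical stability
    return e / e.sum()

def loss(y, gamma):
    # cross-entropy over C classes, as in the derivation above
    return np.sum(y * np.log(1.0 / softmax(gamma)))

gamma = np.array([0.2, -1.0, 0.7])    # logits
y = np.array([0.0, 0.0, 1.0])         # one-hot target

analytic = softmax(gamma) - y         # dL/dgamma_k = softmax_k - y_k
eps = 1e-6
numeric = np.zeros_like(gamma)
for k in range(len(gamma)):
    d = np.zeros_like(gamma)
    d[k] = eps
    numeric[k] = (loss(y, gamma + d) - loss(y, gamma - d)) / (2 * eps)
print(analytic)
print(numeric)                        # should match the analytic gradient to ~1e-9
```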

Hyperbolic Tangent Function

$$ \begin{aligned} \frac{d\tanh(x)}{dx} &= \frac{d}{dx} \frac{\sinh(x)}{\cosh(x)} = \frac{\cosh^2(x) - \sinh^2(x)}{\cosh^2(x)} = 1 - \tanh^2(x) \end{aligned} $$
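
The $1 - \tanh^2(x)$ identity is just as easy to confirm numerically; a minimal NumPy sketch over arbitrary sample points:

```python
import numpy as np

x = np.linspace(-3.0, 3.0, 7)                          # arbitrary sample points
analytic = 1.0 - np.tanh(x) ** 2                       # 1 - tanh^2(x)
eps = 1e-6
numeric = (np.tanh(x + eps) - np.tanh(x - eps)) / (2 * eps)
print(np.max(np.abs(analytic - numeric)))              # should be ~1e-10
```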

by Jon